# Direct outputs eligible for public release

# GLOBAL SETTINGS --------------------------------------------------------------

options(
    scipen = 999,
    digits = 16,
    max.print = .Machine$integer.max,
    show.error.locations = TRUE,
    warn = 1
)

RNGkind("L'Ecuyer-CMRG")
seed <- 818675309L
set.seed(seed) # setting main seed

# PACKAGES ---------------------------------------------------------------------
library(data.table)
library(Matrix)
library(zoo)
library(multiwayvcov)
library(lmtest)
library(checkmate)
library(futile.logger)
library(lfe)

library(remotes)
remotes::install_github("setzler/eventStudy/eventStudy")
library(eventStudy) # for DiD-IV

# PACKAGE SETTINGS -------------------------------------------------------------

# data.table
setDTthreads(threads = 1)
options(datatable.print.class = TRUE, datatable.print.keys = TRUE)
# so that printing the data.table also shows the variable type on top

# BEGIN FILE -------------------------------------------------------------------

# Just in case something masks 'source()', use extra deliberate base::source().
base::source("~/code/0-utility-functions/mathematical-utils.R", local = TRUE)
base::source("~/code/0-utility-functions/table-and-text-utils.R", local = TRUE)
base::source("~/code/0-utility-functions/wald_es.R", local = TRUE)

# READ IN IMBENS, RUBIN, AND SACERDOTE (2001) UNDERLYING DATA ------------------

# The dataset is taken from Li and Mueller (2021, Quantitative Economics)
# @source <https://qeconomics.org/ojs/index.php/qe/article/view/1577>
# @format A data.table with 496 rows and 77 variables
# \describe{
#   \item{yearlpr}{Yearly prize, in $1000 (time invariant in data); 0 denotes "nonwinners"} #nolint
#   \item{yearw}{Year won (time invariant in data)}
#   \item{tixbot}{Tickets bought (time invariant in data)}
#   \item{agew}{Age of the winner in the win year (time invariant in data)}
#   \item{agewgt55}{Binary indicator, agew is greater than 55 (time invariant in data)} #nolint
#   \item{agewgt65}{Binary indicator, agew is greater than 65 (time invariant in data)} #nolint
#   \item{male}{Binary indicator, sex of the winner is male (time invariant in data)} #nolint
#   \item{educ}{Years of schooling (time invariant in data)}
#   \item{college}{College attendance (time invariant in data)}
#   \item{collegedegree}{College degree (time invariant in data)}
#   \item{workthen}{Working then (time invariant in data)}
#   \item{xearn1}{Earnings in year -6, in $1000}
#   \item{xearn2}{Earnings in year -5, in $1000}
#   \item{xearn3}{Earnings in year -4, in $1000}
#   \item{xearn4}{Earnings in year -3, in $1000}
#   \item{xearn5}{Earnings in year -2, in $1000}
#   \item{xearn6}{Earnings in year -1, in $1000}
#   \item{yearn1}{Earnings in year 0, in $1000}
#   \item{yearn2}{Earnings in year +1, in $1000}
#   \item{yearn3}{Earnings in year +2, in $1000}
#   \item{yearn4}{Earnings in year +3, in $1000}
#   \item{yearn5}{Earnings in year +4, in $1000}
#   \item{yearn6}{Earnings in year +5, in $1000}
#   \item{yearn7}{Earnings in year +6, in $1000}
#   \item{xearnp1}{Positive earnings in year -6}
#   \item{xearnp2}{Positive earnings in year -5}
#   \item{xearnp3}{Positive earnings in year -4}
#   \item{xearnp4}{Positive earnings in year -3}
#   \item{xearnp5}{Positive earnings in year -2}
#   \item{xearnp6}{Positive earnings in year -1}
#   \item{yearnp1}{Positive earnings in year 0}
#   \item{yearnp2}{Positive earnings in year +1}
#   \item{yearnp3}{Positive earnings in year +2}
#   \item{yearnp4}{Positive earnings in year +3}
#   \item{yearnp5}{Positive earnings in year +4}
#   \item{yearnp6}{Positive earnings in year +5}
#   \item{yearnp7}{Positive earnings in year +6}
#   \item{winner}{Indicator for lottery winner}
#   \item{bigwinner}{Indicator for an annual prize of at least 100K}
# }

# dictionaries
irs2001_dict <-
    c(
        "yearlpr" = "Yearly prize",
        "yearnavg7" = "Average",
        "yearn1" = "Year 0",
        "yearn2" = "Year 1",
        "yearn3" = "Year 2",
        "yearn4" = "Year 3",
        "yearn5" = "Year 4",
        "yearn6" = "Year 5",
        "yearn7" = "Year 6",
        "yearnavg7_dif" = "Average",
        "yearn1-xearn6" = "Year 0",
        "yearn2-xearn6" = "Year 1",
        "yearn3-xearn6" = "Year 2",
        "yearn4-xearn6" = "Year 3",
        "yearn5-xearn6" = "Year 4",
        "yearn6-xearn6" = "Year 5",
        "yearn7-xearn6" = "Year 6"
    )

# read data
irs2001_sample <-
    readRDS("~/external-data/imbens-rubin-sacerdote-2001-data.rds")

irs2001_sample[, id_var := .I]

# type management -- some numeric type should be converted to integer
integer_vars <-
    c(
        "male",
        "workthen",
        "tixbot",
        "yearw",
        "agew",
        "winner",
        "educ",
        "xearnp1",
        "xearnp2",
        "xearnp3",
        "xearnp4",
        "xearnp5",
        "xearnp6",
        "yearnp1",
        "yearnp2",
        "yearnp3",
        "yearnp4",
        "yearnp5",
        "yearnp6",
        "yearnp7",
        "bigwinner",
        "college",
        "collegedegree",
        "agewgt45",
        "agewgt55",
        "agewgt65",
        "interaction_group",
        "interaction_group_dummies1",
        "interaction_group_dummies2",
        "interaction_group_dummies3",
        "interaction_group_dummies4",
        "interaction_group_dummies5",
        "interaction_group_dummies6",
        "interaction_group_dummies7",
        "interaction_group_dummies8",
        "interaction_group_dummies9",
        "interaction_group_dummies10",
        "interaction_group_dummies11",
        "interaction_group_dummies12",
        "interaction_group_dummies13",
        "interaction_group_dummies14",
        "interaction_group_dummies15",
        "interaction_group_dummies16",
        "obs",
        "ones"
    )

# Type conversion
for (int_col in integer_vars) {
    set(irs2001_sample,
        j = int_col,
        value = as.integer(irs2001_sample[[int_col]])
    )
}
int_col <- NULL
rm(int_col)

# Categorize two key IRS2001 estimation samples (taken directly from IRS2001)
# ==>   Full Winners Sample -- all winners in the IRS2001 with annual payouts
# ==>   Sample Excluding Large Winners -- subset of Full Winners Sample with
#       annual payout of < 100K
exclude_large_cond <-
    "as.integer(yearlpr > 0 & yearlpr < 100)" # nolint
irs2001_sample <- irs2001_sample[yearlpr > 0]
irs2001_sample[, exclude_large_sample := eval(parse(text = exclude_large_cond))]
exclude_large_cond <- NULL
rm(exclude_large_cond)

# CALCULATE MPE ESTIMATE USING OUR DID-IV ESTIMATOR ON THE IRS (2001) DATA -----
# ==> USE LATER WINNERS AS CONTROL GROUP
# Turn their wide data into long panel
# Double-check the positive earnings variables
print(irs2001_sample[as.integer(xearn1 > 0) != xearnp1])
print(irs2001_sample[as.integer(xearn2 > 0) != xearnp2])
print(irs2001_sample[as.integer(xearn3 > 0) != xearnp3])
print(irs2001_sample[as.integer(xearn4 > 0) != xearnp4])
print(irs2001_sample[as.integer(xearn5 > 0) != xearnp5])
print(irs2001_sample[as.integer(xearn6 > 0) != xearnp6])
print(irs2001_sample[as.integer(yearn1 > 0) != yearnp1])
print(irs2001_sample[as.integer(yearn2 > 0) != yearnp2])
print(irs2001_sample[as.integer(yearn3 > 0) != yearnp3])
print(irs2001_sample[as.integer(yearn4 > 0) != yearnp4])
print(irs2001_sample[as.integer(yearn5 > 0) != yearnp5])
print(irs2001_sample[as.integer(yearn6 > 0) != yearnp6])
print(irs2001_sample[as.integer(yearn7 > 0) != yearnp7])
# all check out; *earnp* variables effectively redundant

# First, we restructure their data from wide to long
irs2001_sample_panel <-
    irs2001_sample[, list(
        id_var,
        yearw,
        agew,
        exclude_large_sample,
        yearlpr,
        xearn1,
        xearn2,
        xearn3,
        xearn4,
        xearn5,
        xearn6,
        yearn1,
        yearn2,
        yearn3,
        yearn4,
        yearn5,
        yearn6,
        yearn7
    )]
gc()

time_invariant_vars <-
    c(
        "id_var",
        "yearw",
        "agew",
        "exclude_large_sample",
        "yearlpr"
    )
time_varying_vars <-
    c(
        "xearn1",
        "xearn2",
        "xearn3",
        "xearn4",
        "xearn5",
        "xearn6",
        "yearn1",
        "yearn2",
        "yearn3",
        "yearn4",
        "yearn5",
        "yearn6",
        "yearn7"
    )

irs2001_sample_panel <-
    melt(irs2001_sample_panel,
        id.vars = time_invariant_vars,
        measure.vars = time_varying_vars
    )
setnames(irs2001_sample_panel, "value", "earnings")

# Use information on win year (yearw) and event time (in xearn1, yearn1, etc)
# to fill in the:
# - calendar time
# - age
# - time-varying yearlpr (from 0 to yearlpr in yearw)

# NOTE:
# - xearn6 corresponds to year -1
# - xearn1 corresponds to -6
# - yearn1 corresponds to 0
# => so approach below accounts for this numbering

irs2001_sample_panel[, variable := as.character(variable)]
irs2001_sample_panel[
    grepl("x", variable),
    event_time := (as.integer(extr_right_substr(
        string = variable,
        char = 1L
    )) - 7L)
] # pre
irs2001_sample_panel[
    grepl("y", variable),
    event_time := as.integer(extr_right_substr(
        string = variable,
        char = 1L
    )) - 1L
] # post
irs2001_sample_panel[, calendar_year := yearw + event_time]
irs2001_sample_panel[, age := agew + event_time]
irs2001_sample_panel[, income_change := 0]
irs2001_sample_panel[
    yearlpr > 0 & event_time > -1,
    income_change := yearlpr
]

irs2001_sample_panel <-
    irs2001_sample_panel[
        ,
        .(
            id_var,
            calendar_year,
            yearw,
            exclude_large_sample,
            yearlpr,
            income_change,
            event_time,
            age,
            earnings
        )
    ]

# We match our specification exactly
# - so use the same aged 21-64 inclusion criterion
setorderv(irs2001_sample_panel, c("id_var", "calendar_year"))
irs2001_sample_panel[
    ,
    age_case := as.logical(between(age, 21, 64, incbounds = TRUE))
]
irs2001_sample_panel[, yearw_forWald := yearw]
irs2001_excl_large_sample_panel <-
    irs2001_sample_panel[exclude_large_sample == 1]

# Tables supplying instructions for aggregated results
# With later winners, have restricted set of event times (after exploring)
collapse_table_laterwinners <-
    data.table(
        a = c("one_to_three"),
        b = c(list(1:3))
    )

did_iv_mpes_irs2001 <- list()

setorderv(irs2001_sample_panel, c("id_var", "calendar_year"))
did_iv_mpes_irs2001[[1]] <-
    Wald_ES2(
        long_data = copy(irs2001_sample_panel),
        outcomevar = "earnings",
        unit_var = "id_var",
        cal_time_var = "calendar_year",
        onset_time_var = "yearw_forWald",
        cluster_vars = "id_var",
        cohort_by_cohort = FALSE,
        omitted_event_time = -2L,
        discrete_covars = "age",
        control_subset_var = "age_case",
        control_subset_event_time = 0,
        treated_subset_var = "age_case",
        treated_subset_event_time = 0,
        endog_var = "income_change",
        calculate_collapse_estimates = TRUE,
        collapse_inputs = collapse_table_laterwinners,
        heterogeneous_only = TRUE,
        anticipation = 0,
        case = "balanced_in_event_time"
    )[[1]][
        ref_onset_time == "Cohort-Weighted + Collapsed" &
            model == "ratio",
        .(
            estimate = estimate / 0.9, # converting to MPE, as in IRS (2001)
            se = cluster_se
        )
    ]
did_iv_mpes_irs2001[[1]][, sample := "Full Winners Sample"]
did_iv_mpes_irs2001[[1]][, estimate_type := "MPE"]

setorderv(irs2001_excl_large_sample_panel, c("id_var", "calendar_year"))
did_iv_mpes_irs2001[[2]] <-
    Wald_ES2(
        long_data = copy(irs2001_excl_large_sample_panel),
        outcomevar = "earnings",
        unit_var = "id_var",
        cal_time_var = "calendar_year",
        onset_time_var = "yearw_forWald",
        cluster_vars = "id_var",
        cohort_by_cohort = FALSE,
        omitted_event_time = -2L,
        discrete_covars = "age",
        control_subset_var = "age_case",
        control_subset_event_time = 0,
        treated_subset_var = "age_case",
        treated_subset_event_time = 0,
        endog_var = "income_change",
        calculate_collapse_estimates = TRUE,
        collapse_inputs = collapse_table_laterwinners,
        heterogeneous_only = TRUE,
        anticipation = 0,
        case = "balanced_in_event_time"
    )[[1]][
        ref_onset_time == "Cohort-Weighted + Collapsed" &
            model == "ratio",
        .(
            estimate = estimate / 0.9, # converting to MPE, as in IRS (2001)
            se = cluster_se
        )
    ]
did_iv_mpes_irs2001[[2]][, sample := "Sample Excluding Large Winners"]
did_iv_mpes_irs2001[[2]][, estimate_type := "MPE"]

did_iv_mpes_irs2001 <- rbindlist(did_iv_mpes_irs2001, use.names = TRUE)

# Housekeeping
irs2001_dict <- NULL
irs2001_sample <- NULL
integer_vars <- NULL
time_invariant_vars <- NULL
time_varying_vars <- NULL
irs2001_sample_panel <- NULL
irs2001_excl_large_sample_panel <- NULL
collapse_table_laterwinners <- NULL
rm(
    irs2001_dict,
    irs2001_sample,
    integer_vars,
    time_invariant_vars,
    time_varying_vars,
    irs2001_sample_panel,
    irs2001_excl_large_sample_panel,
    collapse_table_laterwinners
)

# MAKE TABLE -------------------------------------------------------------------

filedir <- "~/paper/tables/table-A.14.tex"
file.create(filedir)

# Start Table A.14
start_latex_table(
    column_structure = "lccccc",
    file_dir = filedir,
    append = TRUE
)
add_latex_table_rule(rule_type = "toprule", file_dir = filedir, append = TRUE)
add_latex_table_rule(rule_type = "midrule", file_dir = filedir, append = TRUE)

panel_a_heading_input1a <-
    " & \\multicolumn{2}{c}{FD Estimator} & & \\multicolumn{2}{c}{DiD-IV Estimator}" # nolint
write_latex_table_row(
    panel_a_heading_input1a,
    file_dir = filedir,
    append = TRUE
)
add_latex_cmidrule(
    point_start_col = "l",
    point_end_col = "r",
    start_col = 2,
    end_col = 3,
    file_dir = filedir,
    append = TRUE
)
add_latex_cmidrule(
    point_start_col = "l",
    point_end_col = "r",
    start_col = 5,
    end_col = 6,
    file_dir = filedir,
    append = TRUE
)

panel_a_heading_input1b <-
    " & Full Winners Sample & Sample Excluding Large Winners & & Full Winners Sample & Sample Excluding Large Winners" # nolint
write_latex_table_row(
    panel_a_heading_input1b,
    file_dir = filedir,
    append = TRUE
)

panel_a_heading_input1b <-
    " & (1) & (2) & & (3) & (4)" # nolint
write_latex_table_row(
    panel_a_heading_input1b,
    file_dir = filedir,
    append = TRUE
)

add_latex_table_rule(rule_type = "midrule", file_dir = filedir, append = TRUE)

# Add body of Table A.14
# - introduce convenience functions given a similar structure throughout

TableA14Row1 <- function(dt) {
    dt_temp <- copy(dt)
    dt_input <-
        sprintf(
            "Effect Estimate & -0.048 & -0.112 & & %s & %s",
            fmt_fixed_decimal(
                input = dt_temp[sample == "Full Winners Sample"]$estimate,
                sub_for_missing = "NA",
                should_be_rounded = TRUE,
                round_digits = 3,
                decimal_digits = 3,
                scale_factor = NA,
                add_comma = TRUE
            ),
            fmt_fixed_decimal(
                input = dt_temp[sample == "Sample Excluding Large Winners"]$estimate, # nolint
                sub_for_missing = "NA",
                should_be_rounded = TRUE,
                round_digits = 3,
                decimal_digits = 3,
                scale_factor = NA,
                add_comma = TRUE
            )
        )
    write_latex_table_row(
        dt_input,
        file_dir = filedir,
        append = TRUE
    )
    invisible(dt_input)
}

TableA14Row2 <- function(dt) {
    dt_temp <- copy(dt)
    dt_input <-
        sprintf(
            "(SE) & (0.010) & (0.029) & & (%s) & (%s)",
            fmt_fixed_decimal(
                input = dt_temp[sample == "Full Winners Sample"]$se,
                sub_for_missing = "NA",
                should_be_rounded = TRUE,
                round_digits = 3,
                decimal_digits = 3,
                scale_factor = NA,
                add_comma = TRUE
            ),
            fmt_fixed_decimal(
                input = dt_temp[sample == "Sample Excluding Large Winners"]$se, # nolint
                sub_for_missing = "NA",
                should_be_rounded = TRUE,
                round_digits = 3,
                decimal_digits = 3,
                scale_factor = NA,
                add_comma = TRUE
            )
        )
    write_latex_table_row(
        dt_input,
        file_dir = filedir,
        append = TRUE
    )
    invisible(dt_input)
}

TableA14Row1(dt = did_iv_mpes_irs2001)
TableA14Row2(dt = did_iv_mpes_irs2001)

# End Table A.14
add_latex_table_rule(
    rule_type = "bottomrule",
    file_dir = filedir,
    append = TRUE
)
end_latex_table(file_dir = filedir, append = TRUE)
filedir <- NULL
rm(filedir)

# Housekeeping
did_iv_mpes_irs2001 <- NULL
panel_a_heading_input1a <- NULL
panel_a_heading_input1b <- NULL
rm(did_iv_mpes_irs2001, panel_a_heading_input1a, panel_a_heading_input1b)